In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt 
import seaborn as sns
import os
In [2]:
df = pd.read_csv("Amazon-Products.csv", low_memory=False, index_col = 0)
In [3]:
df.head(10)
Out[3]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price
0 Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31UISB90sY... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.2 2,255 ₹32,999 ₹58,990
1 LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.2 2,948 ₹46,490 ₹75,990
2 LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Inverter-Convertible-... 4.2 1,206 ₹34,490 ₹61,990
3 LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.0 69 ₹37,990 ₹68,990
4 Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... appliances Air Conditioners https://m.media-amazon.com/images/I/41lrtqXPiW... https://www.amazon.in/Carrier-Inverter-Split-C... 4.1 630 ₹34,490 ₹67,790
5 Voltas 1.4 Ton 3 Star Inverter Split AC(Copper... appliances Air Conditioners https://m.media-amazon.com/images/I/41TuyxwZ9m... https://www.amazon.in/Voltas-Adjustable-173V-V... 4.0 1,666 ₹31,990 ₹70,990
6 Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31IXlxIPsO... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.2 1,097 ₹29,999 ₹49,990
7 Lloyd 1.5 Ton 5 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31IXlxIPsO... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.3 1,494 ₹39,990 ₹67,990
8 Carrier 1 Ton 3 Star AI Flexicool Inverter Spl... appliances Air Conditioners https://m.media-amazon.com/images/I/51sTXvsanQ... https://www.amazon.in/Carrier-Flexicool-Invert... 4.1 674 ₹30,990 ₹58,190
9 Voltas 1.5 Ton, 5 Star, Inverter Split AC(Copp... appliances Air Conditioners https://m.media-amazon.com/images/I/51WQ3nWF0v... https://www.amazon.in/Voltas-Inverter-Split-Co... 4.0 801 ₹37,999 ₹73,990
In [4]:
df.describe()
Out[4]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price
count 551585 551585 551585 551585 551585 375791 375791 490422 533772
unique 396210 20 112 462414 551585 49 8342 27511 23170
top Zeya Yellow Gold Ring accessories Shirts https://m.media-amazon.com/images/I/51uEPldT42... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.0 1 ₹499 ₹999
freq 718 116141 19200 3044 1 36609 39816 18248 48774
In [5]:
df.shape
Out[5]:
(551585, 9)
In [6]:
df.isnull().sum()
Out[6]:
name                   0
main_category          0
sub_category           0
image                  0
link                   0
ratings           175794
no_of_ratings     175794
discount_price     61163
actual_price       17813
dtype: int64
In [7]:
df.columns
Out[7]:
Index(['name', 'main_category', 'sub_category', 'image', 'link', 'ratings',
       'no_of_ratings', 'discount_price', 'actual_price'],
      dtype='object')
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 551585 entries, 0 to 1103
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            551585 non-null  object
 1   main_category   551585 non-null  object
 2   sub_category    551585 non-null  object
 3   image           551585 non-null  object
 4   link            551585 non-null  object
 5   ratings         375791 non-null  object
 6   no_of_ratings   375791 non-null  object
 7   discount_price  490422 non-null  object
 8   actual_price    533772 non-null  object
dtypes: object(9)
memory usage: 42.1+ MB

📂 Clean numeric columns and set data types¶

The columns actual_price, discount_price, no_of_ratings and ratings have wrong datatype. The datatype given is object but we want them to be int or float. Let us correct it. The column amazon_category_and_sub_category has multiple values. In order to clean the data we will seperate them to individual columns.

In [9]:
# Removing the ₹ sign in discount_price and actual_price
df['discount_price'] = df['discount_price'].str.replace('₹','')
In [10]:
df['discount_price']
Out[10]:
0       32,999
1       46,490
2       34,490
3       37,990
4       34,490
         ...  
1099     3,449
1100     1,199
1101     1,199
1102       NaN
1103     1,039
Name: discount_price, Length: 551585, dtype: object
In [11]:
df['actual_price'] = df['actual_price'].str.replace('₹','')
In [12]:
df['actual_price']
Out[12]:
0       58,990
1       75,990
2       61,990
3       68,990
4       67,790
         ...  
1099     4,599
1100     1,999
1101     1,999
1102       NaN
1103     1,299
Name: actual_price, Length: 551585, dtype: object

Exchange Indian Currency to USD¶

In [13]:
df['discount_price'] = df['discount_price'].str.replace(",", "").astype(float)
df['actual_price'] = df['actual_price'].str.replace(",", "").astype(float)
In [14]:
df['discount_price']
Out[14]:
0       32999.0
1       46490.0
2       34490.0
3       37990.0
4       34490.0
         ...   
1099     3449.0
1100     1199.0
1101     1199.0
1102        NaN
1103     1039.0
Name: discount_price, Length: 551585, dtype: float64
In [15]:
df['discount_price'] = df['discount_price'] * 0.01223
In [16]:
df['actual_price'] = df['actual_price'] * 0.01223
In [17]:
df['discount_price']
Out[17]:
0       403.57777
1       568.57270
2       421.81270
3       464.61770
4       421.81270
          ...    
1099     42.18127
1100     14.66377
1101     14.66377
1102          NaN
1103     12.70697
Name: discount_price, Length: 551585, dtype: float64
In [18]:
df['ratings'].unique()
Out[18]:
array(['4.2', '4.0', '4.1', '4.3', '3.9', '3.8', '3.5', nan, '4.6', '3.3',
       '3.4', '3.7', '2.9', '5.0', '4.4', '3.6', '2.7', '4.5', '3.0',
       '3.1', '3.2', '4.8', '4.7', '2.5', '1.0', '2.6', '2.8', '2.3',
       '1.7', 'Get', '1.8', '2.4', '4.9', '2.2', '1.6', '1.9', '2.0',
       '1.4', '2.1', 'FREE', '1.2', '1.3', '1.5', '₹68.99', '₹65', '1.1',
       '₹70', '₹100', '₹99', '₹2.99'], dtype=object)
In [19]:
# Extract the digits and change the type to float
df['ratings'] = df['ratings'].replace(['Get','FREE','₹68.99', '₹65','₹70', '₹100', '₹99', '₹2.99'], '0.0')
df['ratings'] = df["ratings"].astype(float)
df['ratings'].unique()
Out[19]:
array([4.2, 4. , 4.1, 4.3, 3.9, 3.8, 3.5, nan, 4.6, 3.3, 3.4, 3.7, 2.9,
       5. , 4.4, 3.6, 2.7, 4.5, 3. , 3.1, 3.2, 4.8, 4.7, 2.5, 1. , 2.6,
       2.8, 2.3, 1.7, 0. , 1.8, 2.4, 4.9, 2.2, 1.6, 1.9, 2. , 1.4, 2.1,
       1.2, 1.3, 1.5, 1.1])

➡️ Preprocess Rating column¶

The 'no_of_ratings' column is converted to the float type in two steps: first, a new boolean column is formed, where the True value corresponds to the numeric value in the original column. Then the values ​​in the 'no_of_ratings' column are recalculated for values ​​matching the True of the 'correct_no_of_ratings' column

In [20]:
# Add column 'correct_no_of_ratings' which value is 'True' if 'no_of_ratings' begins from digit
df['no_of_ratings'] = df['no_of_ratings'].astype(str)
df['correct_no_of_ratings'] = df['no_of_ratings'].apply(lambda x: x[0].isdigit())
In [21]:
# Drop columns with incorrect 'no_of_ratings'
df = df[df['correct_no_of_ratings'] == True]
df['correct_no_of_ratings'].value_counts()
Out[21]:
True    369558
Name: correct_no_of_ratings, dtype: int64
In [22]:
#Change type to float
df['no_of_ratings'] = df['no_of_ratings'].str.replace(",","").astype(float)
In [23]:
df.head(10)
Out[23]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price correct_no_of_ratings
0 Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31UISB90sY... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.2 2255.0 403.57777 721.4477 True
1 LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.2 2948.0 568.57270 929.3577 True
2 LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Inverter-Convertible-... 4.2 1206.0 421.81270 758.1377 True
3 LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.0 69.0 464.61770 843.7477 True
4 Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... appliances Air Conditioners https://m.media-amazon.com/images/I/41lrtqXPiW... https://www.amazon.in/Carrier-Inverter-Split-C... 4.1 630.0 421.81270 829.0717 True
5 Voltas 1.4 Ton 3 Star Inverter Split AC(Copper... appliances Air Conditioners https://m.media-amazon.com/images/I/41TuyxwZ9m... https://www.amazon.in/Voltas-Adjustable-173V-V... 4.0 1666.0 391.23770 868.2077 True
6 Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31IXlxIPsO... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.2 1097.0 366.88777 611.3777 True
7 Lloyd 1.5 Ton 5 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31IXlxIPsO... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.3 1494.0 489.07770 831.5177 True
8 Carrier 1 Ton 3 Star AI Flexicool Inverter Spl... appliances Air Conditioners https://m.media-amazon.com/images/I/51sTXvsanQ... https://www.amazon.in/Carrier-Flexicool-Invert... 4.1 674.0 379.00770 711.6637 True
9 Voltas 1.5 Ton, 5 Star, Inverter Split AC(Copp... appliances Air Conditioners https://m.media-amazon.com/images/I/51WQ3nWF0v... https://www.amazon.in/Voltas-Inverter-Split-Co... 4.0 801.0 464.72777 904.8977 True
In [24]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 369558 entries, 0 to 1103
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   name                   369558 non-null  object 
 1   main_category          369558 non-null  object 
 2   sub_category           369558 non-null  object 
 3   image                  369558 non-null  object 
 4   link                   369558 non-null  object 
 5   ratings                369558 non-null  float64
 6   no_of_ratings          369558 non-null  float64
 7   discount_price         334963 non-null  float64
 8   actual_price           362797 non-null  float64
 9   correct_no_of_ratings  369558 non-null  bool   
dtypes: bool(1), float64(4), object(5)
memory usage: 28.5+ MB
In [25]:
plt.figure(figsize=(10,6))
sns.displot(
    data=df.isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    aspect=1.25
)
Out[25]:
<seaborn.axisgrid.FacetGrid at 0x20c84cf7fa0>
<Figure size 1000x600 with 0 Axes>
In [26]:
# Calculate the percentage of missing values in each column
missing_data = df.isnull().sum()

# Create a bar chart to visualize the percentage of missing values
plt.figure(figsize=(10, 5))
plt.bar(missing_data.index, missing_data)
plt.xticks(rotation=90)
plt.ylabel('Percentage of missing values')
plt.show()

🗄️ EDA¶

Now let us imagine we are browsing the Amazon website. What are the things that you see when you click on a product. For me the priority order is as follows:

  1. Price
  2. Rating
  3. Manufacturer
  4. Description
  5. Customer reviews
    Let us see analyze the given dataframe on following points.
In [27]:
df = df.dropna(subset=['actual_price','discount_price'])
df.head()
Out[27]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price correct_no_of_ratings
0 Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31UISB90sY... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.2 2255.0 403.57777 721.4477 True
1 LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.2 2948.0 568.57270 929.3577 True
2 LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Inverter-Convertible-... 4.2 1206.0 421.81270 758.1377 True
3 LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.0 69.0 464.61770 843.7477 True
4 Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... appliances Air Conditioners https://m.media-amazon.com/images/I/41lrtqXPiW... https://www.amazon.in/Carrier-Inverter-Split-C... 4.1 630.0 421.81270 829.0717 True

➡️ Extract the manufacturer from the 'name' column¶

In [28]:
df['manufacturer'] = df['name'].str.split(' ').str[0]
In [29]:
cols = df.columns.tolist()
cols
Out[29]:
['name',
 'main_category',
 'sub_category',
 'image',
 'link',
 'ratings',
 'no_of_ratings',
 'discount_price',
 'actual_price',
 'correct_no_of_ratings',
 'manufacturer']
In [30]:
df = df[cols]
df.head()
Out[30]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price correct_no_of_ratings manufacturer
0 Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31UISB90sY... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.2 2255.0 403.57777 721.4477 True Lloyd
1 LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.2 2948.0 568.57270 929.3577 True LG
2 LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Inverter-Convertible-... 4.2 1206.0 421.81270 758.1377 True LG
3 LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.0 69.0 464.61770 843.7477 True LG
4 Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... appliances Air Conditioners https://m.media-amazon.com/images/I/41lrtqXPiW... https://www.amazon.in/Carrier-Inverter-Split-C... 4.1 630.0 421.81270 829.0717 True Carrier
In [31]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 334963 entries, 0 to 1103
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   name                   334963 non-null  object 
 1   main_category          334963 non-null  object 
 2   sub_category           334963 non-null  object 
 3   image                  334963 non-null  object 
 4   link                   334963 non-null  object 
 5   ratings                334963 non-null  float64
 6   no_of_ratings          334963 non-null  float64
 7   discount_price         334963 non-null  float64
 8   actual_price           334963 non-null  float64
 9   correct_no_of_ratings  334963 non-null  bool   
 10  manufacturer           334963 non-null  object 
dtypes: bool(1), float64(4), object(6)
memory usage: 28.4+ MB
In [32]:
#Caculating discount net value and percentage
df['discount_value'] = df['actual_price'] - df['discount_price']
df['discount_percentage'] = 1 - df['discount_price']/df['actual_price']
In [33]:
df.head()
Out[33]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price correct_no_of_ratings manufacturer discount_value discount_percentage
0 Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... appliances Air Conditioners https://m.media-amazon.com/images/I/31UISB90sY... https://www.amazon.in/Lloyd-Inverter-Convertib... 4.2 2255.0 403.57777 721.4477 True Lloyd 317.86993 0.440600
1 LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.2 2948.0 568.57270 929.3577 True LG 360.78500 0.388209
2 LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Inverter-Convertible-... 4.2 1206.0 421.81270 758.1377 True LG 336.32500 0.443620
3 LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C... appliances Air Conditioners https://m.media-amazon.com/images/I/51JFb7FctD... https://www.amazon.in/LG-Convertible-Anti-Viru... 4.0 69.0 464.61770 843.7477 True LG 379.13000 0.449340
4 Carrier 1.5 Ton 3 Star Inverter Split AC (Copp... appliances Air Conditioners https://m.media-amazon.com/images/I/41lrtqXPiW... https://www.amazon.in/Carrier-Inverter-Split-C... 4.1 630.0 421.81270 829.0717 True Carrier 407.25900 0.491223
In [34]:
# Detail of the maximum price row
df[df['actual_price'] == df['actual_price'].max()]
Out[34]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price correct_no_of_ratings manufacturer discount_value discount_percentage
853 Marutivilla Insect Killer, Mosquito Killer Lig... home & kitchen Garden & Outdoors https://m.media-amazon.com/images/W/IMAGERENDE... https://www.amazon.in/Marutivilla-Mosquito-Suc... 2.0 1.0 7.20347 1.210770e+08 True Marutivilla 1.210770e+08 1.0
In [35]:
# Detail of the minimum price row
df[df["discount_value"] == df["discount_value"].min()]
Out[35]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price correct_no_of_ratings manufacturer discount_value discount_percentage
580 GKFML Fiber Glass 30 METRE FIBER PLASTIKA MEAS... industrial supplies Test, Measure & Inspect https://m.media-amazon.com/images/I/41kdufmMNA... https://www.amazon.in/GKFML-Fiber-Glass-PLASTI... 3.3 8.0 7.215578 7.2157 True GKFML 0.000122 0.000017
In [36]:
values = df["manufacturer"].value_counts().keys().tolist()[:10]
counts = df["manufacturer"].value_counts().tolist()[:10]
In [37]:
fig = px.bar(df, y = counts, x = values,
            color_discrete_sequence = ["#EC2781"] * len(df))


fig.update_layout(
                 plot_bgcolor = "#ECECEC",
                  yaxis_title = "Count",
                xaxis_title = "Name of Manufacturers",
                  title = "<b>Popular Manufacturers Category</b>"
                 )
fig.show()

✔️ Insight 1¶

From above graph we see that the Puma is most popular. Let us check the main category for the above top 10 brands

In [38]:
#Top 10 manufacturer
df_list = []
for i in values:
    x = df[df['manufacturer'] == i]
    df_list.append(x)
frame = pd.concat(df_list)
frame.head()
Out[38]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price correct_no_of_ratings manufacturer discount_value discount_percentage
107 Puma polyester 23 Cms Gym Bag(7572229_Pink_X_Red) sports & fitness All Exercise & Fitness https://m.media-amazon.com/images/W/IMAGERENDE... https://www.amazon.in/PUMA-Polyester-Bridal-Ro... 4.1 249.0 9.64947 18.33277 True Puma 8.68330 0.473649
608 Puma Women's Boyfriend Leggings sports & fitness All Exercise & Fitness https://m.media-amazon.com/images/I/41SJp8rVXL... https://www.amazon.in/Puma-Womens-Boyfriend-Le... 4.4 12.0 6.92218 24.44777 True Puma 17.52559 0.716858
798 Puma Women Track Pants sports & fitness All Exercise & Fitness https://m.media-amazon.com/images/W/IMAGERENDE... https://www.amazon.in/Puma-Womens-Regular-6755... 4.4 5.0 20.16727 36.67777 True Puma 16.51050 0.450150
985 Puma Men Pants sports & fitness All Exercise & Fitness https://m.media-amazon.com/images/W/IMAGERENDE... https://www.amazon.in/Puma-Worldwide-Graphic-M... 3.7 12.0 13.57530 36.67777 True Puma 23.10247 0.629877
1073 Puma Men's Plain Socks sports & fitness All Exercise & Fitness https://m.media-amazon.com/images/W/IMAGERENDE... https://www.amazon.in/Puma-Plain-Socks-4055263... 4.4 11.0 4.14597 12.21777 True Puma 8.07180 0.660661
In [39]:
# Average rating of the manufactures
frame[['manufacturer', 'ratings']].groupby("manufacturer").mean().sort_values(by = 'ratings',
                                                                               ascending = False)
Out[39]:
ratings
manufacturer
The 3.948179
Van 3.944422
Amazon 3.940362
Puma 3.914077
Levi's 3.855082
U.S. 3.823259
Campus 3.812296
Red 3.731996
Pepe 3.713827
Clovia 3.700071

✔️ Insight 2¶

Even though the most popular brand is Puma but the highest rated is Amazon. Also the manufacturer Amazon is second most favourite in the popular manufacturer category. On the other hand Puma is second in terms of average rating

In [40]:
frame['main_category'].unique()
Out[40]:
array(['sports & fitness', 'stores', "kids' fashion", 'bags & luggage',
       'accessories', 'car & motorbike', "men's shoes",
       "women's clothing", "women's shoes", "men's clothing",
       'appliances', 'tv, audio & cameras', 'grocery & gourmet foods',
       'home & kitchen', 'pet supplies', 'toys & baby products',
       'beauty & health', 'industrial supplies', 'music'], dtype=object)
In [41]:
plt.figure(figsize=(8,6))
sns.countplot(x=frame['main_category'])
plt.xlabel('Main Categories')
plt.ylabel('Count')
plt.title('Count of Main Categories of Product')
plt.xticks(rotation=90)
plt.show()

✔️ Insight 3¶

"Men's Clothing" is the most popular category. Let's find top 10 popular main category

In [42]:
value_main = frame["main_category"].value_counts().keys().tolist()[:5]
count_main = frame["main_category"].value_counts().tolist()[:5]
value_main
Out[42]:
["men's clothing", "men's shoes", "women's clothing", 'stores', 'accessories']
In [43]:
df_list = []
for i in value_main:
    x = frame[frame['main_category'] == i]
    df_list.append(x)
frame = pd.concat(df_list)
In [44]:
frame.head()
Out[44]:
name main_category sub_category image link ratings no_of_ratings discount_price actual_price correct_no_of_ratings manufacturer discount_value discount_percentage
1140 Puma Men's Regular Fit Vest men's clothing Innerwear https://m.media-amazon.com/images/W/IMAGERENDE... https://www.amazon.in/PUMA-Solid-Regular-67459... 4.4 5.0 6.83657 9.77177 True Puma 2.9352 0.300375
1287 Puma Men's Boxer Shorts (58672906_Blue_XL) men's clothing Innerwear https://m.media-amazon.com/images/I/51MjTF8Oxh... https://www.amazon.in/Puma-Regular-Polyester-U... 4.6 20.0 9.89407 22.00177 True Puma 12.1077 0.550306
1298 Puma Men's Boxer Shorts (58672906_Blue_XL) men's clothing Innerwear https://m.media-amazon.com/images/I/51MjTF8Oxh... https://www.amazon.in/Puma-Regular-Polyester-U... 4.6 20.0 9.89407 22.00177 True Puma 12.1077 0.550306
1343 Puma Men Briefs men's clothing Innerwear https://m.media-amazon.com/images/I/71MBP-u6eb... https://www.amazon.in/Stretch-Brief-Plain-Whit... 3.8 63.0 5.61357 7.32577 True Puma 1.7122 0.233723
1873 Puma Men's Regular Fit Vests men's clothing Innerwear https://m.media-amazon.com/images/I/51WDyvgIkk... https://www.amazon.in/Puma-Mens-Polyester-Vest... 2.6 2.0 10.99477 22.00177 True Puma 11.0070 0.500278
In [45]:
frame_sub = frame[['main_category','sub_category']]
frame_sub.value_counts()
Out[45]:
main_category     sub_category              
men's clothing    T-shirts & Polos              2384
                  Shirts                        2373
men's shoes       Sports Shoes                  2011
                  Casual Shoes                  1633
men's clothing    Jeans                         1535
stores            Men's Fashion                 1404
women's clothing  Lingerie & Nightwear          1288
                  Western Wear                  1126
stores            Sportswear                    1014
women's clothing  Clothing                       867
men's clothing    Innerwear                      673
accessories       Bags & Luggage                 569
men's shoes       Formal Shoes                   410
accessories       Handbags & Clutches            340
women's clothing  Ethnic Wear                    291
accessories       Fashion & Silver Jewellery     161
                  Watches                        141
stores            Amazon Fashion                  79
                  Women's Fashion                 73
accessories       Jewellery                       68
stores            The Designer Boutique            1
dtype: int64

Now we have completed the second phase of data preprocessing. After this we have achieved a dataframe with following characterstics:

No null price. Top 10 manufacturers with respect to count Popular top 5 main categories, 10 sub_category Let us now check the average rating and price for this selected dataframe.

In [46]:
# Rating of the products
print("The average rating: ",frame["ratings"].unique())

# After processing our data we have significantly reduced the size of the dataframe.
# Also the rating are now 4 or greater.
# Let us now check new average price ### check above before processing to compare.
print("The average price: ", frame["actual_price"].mean())
The average rating:  [4.4 4.6 3.8 2.6 3.5 3.2 1.  4.  4.2 4.1 4.3 3.  5.  3.4 4.5 3.9 3.7 3.6
 2.5 2.9 2.3 3.3 2.8 3.1 2.  4.7 2.1 4.8 1.5 2.4 2.7 4.9 2.2 1.6 1.8 1.4
 1.9 1.7 1.3]
The average price:  32.751743428660546
In [47]:
import matplotlib.pyplot as plt

# Rating of the products
ratings = frame["ratings"].unique()
plt.figure(figsize=(8, 4))
plt.bar(range(len(ratings)), ratings)
plt.xlabel("Count")
plt.ylabel("Rating")
plt.title("Distribution of Ratings")
plt.show()

Insights:¶

The outlier in price data are around 5.32%
Puma and Amazon are the most popular manufactures with outlier price
The maximum number rating of popular brands is in range of 4 star
Mostly 0-49 review were given on the products